years_1 <- c(1900:2012, 2014)
years_2 <- c(2015:2019)
importing_data = function(x){
if(str_detect(x, str_c(years_1, collapse = "|"))) {
read_csv(x, na = c("NULL", "", "0"), col_types = "cicccciiiicc")
}
else if(str_detect(x, str_c(years_2, collapse = "|"))){
read_csv(x, na = c("NULL", "", "0"), col_types = "cccicccccccccccccccccciiiiccc")
}
}
boston_df <-
tibble(list.files("data", full.names = TRUE)) %>%
setNames("file_name") %>%
mutate(data = map(file_name, importing_data)) %>%
unnest(data) %>%
mutate(year = readr::parse_number(file_name),
city = coalesce(city, residence),
display_name = str_replace_all(display_name, "[^a-zA-Z0-9]", " ")) %>%
filter(!is.na(display_name)) %>%
select(-file_name, -residence, -first_name, -last_name)We were interested in creating a map with the locations where winners from the Boston marathon over the past 120 year are from. Additionally, we wanted to examine how the locations of winners changed over time. We also analyzed data from winners from the wheel chair division and compared results to the men and women open divsion.
boston_df2 = boston_df %>%
filter(year > 1999) %>%
filter(overall == 1) %>%
filter(gender == "M") %>%
select(year, city, state, overall, everything()) %>%
drop_na(city) %>%
separate(city, into = c("city", "state", "country"), sep = ",") %>%
select(-country, -state) %>%
writexl::write_xlsx("interactive_map_men.xlsx")
boston_df3 = boston_df %>%
filter(year > 1999) %>%
filter(gender_result == 1) %>%
filter(gender == "F") %>%
select(year, city, state, overall, everything()) %>%
drop_na(city) %>%
writexl::write_xlsx("interactive_map_women.xlsx")map_df = read_excel("data/latitude_longitude_winners.xlsx", sheet = 1) %>%
select(year, city, latitude, longitude, age, gender, official_time, display_name) %>%
rename(place = city)
map_df2 = read_excel("data/latitude_longitude_winners.xlsx", sheet = 2) %>%
select(year, city, latitude, longitude, age, gender, official_time, display_name) %>%
rename(place = city) men_open = read_excel("data/geo_winners.xlsx", sheet = 4) %>%
janitor::clean_names() %>%
rename(place = country) %>%
separate(official_time, into = c("data", "official_time"), sep = " (?=[^ ]+$)") %>%
select(-data)
lat_long = read_excel("data/lat_long.xlsx") %>%
select(-country) %>%
rename(place = name)
men_merge <- merge(men_open,lat_long,by="place") %>%
rename(display_name = name) %>%
mutate(gender = "M")
men_merge2 = men_merge %>%
mutate(age = NA) %>%
filter(year > 1900) %>%
filter(year < 2000)
men_total = rbind(men_merge2, map_df)
women_open = read_excel("data/geo_winners.xlsx", sheet = 3) %>%
janitor::clean_names() %>%
rename(place = country) %>%
separate(official_time, into = c("data", "official_time"), sep = " (?=[^ ]+$)") %>%
select(-data)
women_merge = merge(women_open, lat_long, by="place") %>%
rename(display_name = name) %>%
mutate(gender = "F")
women_merge2 = women_merge %>%
mutate(age = NA) %>%
filter(year > 1900) %>%
filter(year < 2000)
woman_total = rbind(women_merge2, map_df2) %>%
drop_na(latitude)women_wheelchair = read_excel("data/geo_winners.xlsx", sheet = "women_wheel_chair") %>%
janitor::clean_names() %>%
separate(official_time, into = c("data", "official_time"), sep = " (?=[^ ]+$)") %>%
select(-data) %>%
mutate(gender = "F")
men_wheelchair = read_excel("data/geo_winners.xlsx", sheet = "men_wheel_chair") %>%
janitor::clean_names() %>%
separate(official_time, into = c("data", "official_time"), sep = " (?=[^ ]+$)") %>%
select(-data) %>%
mutate(gender = "M")
wheelchair_merge = rbind(women_wheelchair, men_wheelchair) %>%
rename(place = country)
wheelchair_total = merge(wheelchair_merge, lat_long, by="place") %>%
filter(!(year == 2013))map_winners = leaflet(men_total) %>%
addProviderTiles(providers$CartoDB.Positron) %>%
addMarkers(lat = ~latitude, lng = ~longitude,
popup = paste("Name:", men_total$display_name, "<br>", "Year:", men_total$year,"<br>", "Official Time:", men_total$official_time, "<br>", "Age:", men_total$age, "<br>", "Gender:", men_total$gender), clusterOptions = markerClusterOptions())
map_winnersThe continent with the largest nuber of male winners from 1900 and beyond is North American, specifically the US, followed by Africa. There are 44 winners from the United States, 20 from Kenya, 10 from Japan.
map_winners_women = leaflet(woman_total) %>%
addProviderTiles(providers$CartoDB.Positron) %>%
addMarkers(lat = ~latitude, lng = ~longitude,
popup = paste("Name:", woman_total$display_name, "<br>", "Year:", woman_total$year,"<br>", "Official Time:", woman_total$official_time, "<br>", "Age:", woman_total$age, "<br>", "Gender:", woman_total$gender), clusterOptions = markerClusterOptions())
map_winners_womenThis is in contrast to female winners from 1900 and beyond where Africa is the continent with the most winners, closely followed by Europe, and North America. Most of the female winners in Africa are from Kenya. The map for female winners suggests that a participants proximinity to Boston likely does not play a role in their probability of winning the marathon.
winners_bind = rbind(women_merge, men_merge)
plot2 = winners_bind %>%
plot_ly(
x = ~year, y = ~official_time, color = ~place,
type = "scatter") %>%
layout(
title = "Official Times of Male and Female Winners each Year by Winner's Country")
plot2Over time, the winning time for both male and female winners has declined. While prior to 1950, most of the winners were from the United States, most of the winners are from Kenya. Additionally, we can see that the lowest marathon times have been from people from Kenya.
map_wheelchair = leaflet(wheelchair_total) %>%
addProviderTiles(providers$CartoDB.Positron) %>%
addMarkers(lat = ~latitude, lng = ~longitude,
popup = paste("Name:", wheelchair_total$name, "<br>", "Year:", wheelchair_total$year,"<br>", "Official Time:", wheelchair_total$official_time, "<br>", "Gender:", wheelchair_total$gender), clusterOptions = markerClusterOptions())
map_wheelchairWhen assessing wheel chair division map, we can see that 25 winners are from the United States, followed by 19 from Europe, and 10 from Africa. While the male and female open division lack winners from Switzerland, there are 15 winners in the wheel chair division from Switzerland.
plot3 = wheelchair_total %>%
plot_ly(
x = ~year, y = ~official_time, color = ~place,
type = "scatter") %>%
layout(
title = "Official Times of Winners each Year in Wheel Chair Division")
plot3While there is a downward trend in official times for the open division, there is no trend in official times over time. Many of the winners pre-2000 are from the United States while many of the winners post-2000 are from South Africa. The lowest time recorded is from an individual in Switzerland.